# Load the Auto data set (ISLR) and derive a binary response:
# mpglevel = 1 when a car's mpg exceeds the sample median, else 0.
library(ISLR)
# NOTE(review): the original called attach(Auto) here, but no later code
# relies on attached names (everything uses Auto$... or a data = argument),
# and the captured output below shows ggplot2 masking `mpg` from the
# attached frame when caret loads it. The attach() is therefore dropped.
bin <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
Auto$mpglevel <- as.factor(bin)
# Reproducible 50/50 train/test split of Auto, then a linear-kernel SVM
# tuned by 10-fold cross-validation over cost.
set.seed(1234)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'Auto':
##
## mpg
library(e1071)
# Half of the row indices form the training set; the remainder is held out.
autosample <- sample(1:nrow(Auto), nrow(Auto)/2)
train_data <- Auto[autosample,]
# NOTE(review): test_data is created but never evaluated against anywhere in
# this file -- TODO confirm whether a test-error computation was intended.
test_data <- Auto[-autosample,]
# 10-fold CV over cost for a linear-kernel SVM.
# NOTE(review): `mpglevel ~ .` keeps mpg among the predictors, but mpglevel
# was derived from mpg (threshold at its median), so this is data leakage;
# consider `mpglevel ~ . - mpg` if the goal is to predict from the other
# variables. The very low CV errors below reflect this.
svm_model <-tune(svm, mpglevel ~ ., data = train_data, kernel = "linear", ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))
summary(svm_model)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 5
##
## - best performance: 0.01552632
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.08684211 0.05881774
## 2 0.10 0.07157895 0.06458905
## 3 1.00 0.02552632 0.02692425
## 4 5.00 0.01552632 0.02501000
## 5 10.00 0.01552632 0.02501000
From the above results we can see that cost values of 5 and 10 give the lowest cross-validation error of about 0.0155.
# 10-fold CV over cost x degree for a polynomial-kernel SVM.
# NOTE(review): every (cost, degree) combination below reports the identical
# CV error (~0.48, close to the majority-class rate) -- the polynomial fit
# appears degenerate at these settings; consider tuning gamma/coef0 as well.
svm_model_polynomial <-tune(svm, mpglevel ~ ., data = train_data, kernel = "polynomial", ranges = list(cost = c(0.01, 0.1, 1, 5, 10),degree=c(2,3,4,5)))
summary(svm_model_polynomial)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost degree
## 0.01 2
##
## - best performance: 0.4797368
##
## - Detailed performance results:
## cost degree error dispersion
## 1 0.01 2 0.4797368 0.05391776
## 2 0.10 2 0.4797368 0.05391776
## 3 1.00 2 0.4797368 0.05391776
## 4 5.00 2 0.4797368 0.05391776
## 5 10.00 2 0.4797368 0.05391776
## 6 0.01 3 0.4797368 0.05391776
## 7 0.10 3 0.4797368 0.05391776
## 8 1.00 3 0.4797368 0.05391776
## 9 5.00 3 0.4797368 0.05391776
## 10 10.00 3 0.4797368 0.05391776
## 11 0.01 4 0.4797368 0.05391776
## 12 0.10 4 0.4797368 0.05391776
## 13 1.00 4 0.4797368 0.05391776
## 14 5.00 4 0.4797368 0.05391776
## 15 10.00 4 0.4797368 0.05391776
## 16 0.01 5 0.4797368 0.05391776
## 17 0.10 5 0.4797368 0.05391776
## 18 1.00 5 0.4797368 0.05391776
## 19 5.00 5 0.4797368 0.05391776
## 20 10.00 5 0.4797368 0.05391776
# 10-fold CV over cost x gamma for a radial (RBF) kernel SVM.
svm_model_radial <-tune(svm, mpglevel ~ ., data = train_data, kernel = "radial", ranges = list(cost = c(0.01, 0.1, 1, 5, 10),gamma=c(0.01,0.1,1,5,10,100)))
# Best result below: cost = 5, gamma = 0.1 (CV error ~0.036).
summary(svm_model_radial)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 5 0.1
##
## - best performance: 0.03578947
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 0.01 1e-02 0.53052632 0.13353056
## 2 0.10 1e-02 0.11289474 0.08478831
## 3 1.00 1e-02 0.08210526 0.06103650
## 4 5.00 1e-02 0.07684211 0.05000462
## 5 10.00 1e-02 0.05131579 0.03424986
## 6 0.01 1e-01 0.53052632 0.13353056
## 7 0.10 1e-01 0.08736842 0.06047926
## 8 1.00 1e-01 0.06657895 0.04270498
## 9 5.00 1e-01 0.03578947 0.03415876
## 10 10.00 1e-01 0.03578947 0.03415876
## 11 0.01 1e+00 0.53052632 0.13353056
## 12 0.10 1e+00 0.53052632 0.13353056
## 13 1.00 1e+00 0.08763158 0.05604501
## 14 5.00 1e+00 0.08263158 0.05704048
## 15 10.00 1e+00 0.08263158 0.05704048
## 16 0.01 5e+00 0.53052632 0.13353056
## 17 0.10 5e+00 0.53052632 0.13353056
## 18 1.00 5e+00 0.51026316 0.13107183
## 19 5.00 5e+00 0.51026316 0.13107183
## 20 10.00 5e+00 0.51026316 0.13107183
## 21 0.01 1e+01 0.53052632 0.13353056
## 22 0.10 1e+01 0.53052632 0.13353056
## 23 1.00 1e+01 0.51552632 0.13738898
## 24 5.00 1e+01 0.51026316 0.13107183
## 25 10.00 1e+01 0.51026316 0.13107183
## 26 0.01 1e+02 0.53052632 0.13353056
## 27 0.10 1e+02 0.53052632 0.13353056
## 28 1.00 1e+02 0.53052632 0.13353056
## 29 5.00 1e+02 0.53052632 0.13353056
## 30 10.00 1e+02 0.53052632 0.13353056
From the above summaries, we can see that the polynomial kernel performs best at cost 0.01 and degree 2, while the radial kernel performs best at cost 5 and gamma 0.1.
# Diagnostic plots of CV error versus hyper-parameters for each tuning run.
plot(svm_model)
plot(svm_model_polynomial)
plot(svm_model_radial)
# Feature columns to pair with mpg in the 2-D SVM plots: columns 2..(p-2)
# of Auto (cylinders through origin), skipping `name` (a high-cardinality
# factor) and the response mpglevel itself.
len <- 2:(ncol(Auto) - 2)
# Refit each kernel on the full data set at its best CV parameters.
svm_mod_lin_1 <- svm(mpglevel ~ ., data = Auto, kernel = "linear", cost = 5)
svm_mod_pol_1 <- svm(mpglevel ~ ., data = Auto, kernel = "polynomial", cost = 0.01, degree = 2)
svm_mod_rad_1 <- svm(mpglevel ~ ., data = Auto, kernel = "radial", cost = 5, gamma = 0.1)
# Draw the fitted decision regions of `model` in the (feature, mpg) plane
# for every candidate feature. One helper replaces three copy-pasted loops.
plot_svm_slices <- function(model) {
  for (i in len) {
    plot(model, Auto, as.formula(paste("mpg~", names(Auto)[i])))
  }
}
plot_svm_slices(svm_mod_lin_1)
plot_svm_slices(svm_mod_pol_1)
plot_svm_slices(svm_mod_rad_1)
### Problem 2: In this problem, you will perform K-means clustering manually, with K = 2, on a small example with n = 6 observations and p = 2 features. The observations are as follows.
# Problem 2 data: n = 6 observations on p = 2 features.
# (Uses <- for assignment; c(1:6) was a redundant wrapper around 1:6.)
c2 <- 1:6                   # observation IDs
c3 <- c(1, 1, 0, 5, 6, 4)   # feature X1 (also reused by the plots below)
c4 <- c(4, 3, 4, 1, 2, 0)   # feature X2
library(knitr)
library(kableExtra)
dataSet <- data.frame(Observation = c2, X1 = c3, X2 = c4)
# Render the observations as a styled HTML table.
kable(dataSet) %>% kable_styling(bootstrap_options = c("striped", "hover"), font_size = 10.5, position = "center")
| Observation | X1 | X2 |
|---|---|---|
| 1 | 1 | 4 |
| 2 | 1 | 3 |
| 3 | 0 | 4 |
| 4 | 5 | 1 |
| 5 | 6 | 2 |
| 6 | 4 | 0 |
# Scatterplot of the six observations (X1 vs X2).
plot(c3, c4, type="p", main = "Scatterplot")
# Manual K-means, step 1 (K = 2): give each point a random cluster label.
set.seed(124564)
obs_data <- cbind(c3,c4)
clust_label <- sample(x = 2, size = nrow(obs_data), replace = TRUE)
clust_label
## [1] 1 1 2 1 2 1
# Plot points coloured by their random initial labels (blue = 1, green = 2).
plot(obs_data[clust_label == 1, 1], obs_data[clust_label == 1, 2], col = 4, pch = 20, cex = 3,
xlim = c(0, 6))
points(obs_data[clust_label == 2, 1], obs_data[clust_label == 2, 2], col = 3, pch = 20, cex = 3)
# Re-draw the same plot so the centroid markers added below overlay a fresh copy.
plot(obs_data[clust_label == 1, 1], obs_data[clust_label == 1, 2], col = 4, pch = 20, cex = 3,
xlim = c(0, 6))
points(obs_data[clust_label == 2, 1], obs_data[clust_label == 2, 2], col = 3, pch = 20, cex = 3)
# Step 2: compute each cluster's centroid (mean X1, mean X2).
centroid1 <- c(mean(obs_data[clust_label == 1, 1]), mean(obs_data[clust_label == 1, 2]))
centroid2 <- c(mean(obs_data[clust_label == 2, 1]), mean(obs_data[clust_label == 2, 2]))
centroid1
## [1] 2.75 2.00
centroid2
## [1] 3 3
# Mark each centroid with an 'x' in the matching cluster colour.
points(centroid1[1], centroid1[2], col = 4, pch = 4)
points(centroid2[1], centroid2[2], col = 3, pch = 4)
# Euclidean distance between two 2-D points x and y (only the first two
# coordinates of each argument are used).
distance <- function(x, y) {
  dx <- x[1] - y[1]
  dy <- x[2] - y[2]
  sqrt(dx^2 + dy^2)
}
# Step 3 (iteration 1): reassign every observation to its nearest centroid,
# using Euclidean distance to centroid1 and centroid2 computed above.
# For observation 1:
distance(obs_data[1,], centroid1)
## c3
## 2.657536
distance(obs_data[1,], centroid2)
## c3
## 2.236068
# Closer to centroid 2.
clust_label[1] <- 2
# For observation 2:
distance(obs_data[2,], centroid1)
## c3
## 2.015564
distance(obs_data[2,], centroid2)
## c3
## 2
# Closer to centroid 2.
clust_label[2] <- 2
# For observation 3:
distance(obs_data[3,], centroid1)
## c3
## 3.400368
distance(obs_data[3,], centroid2)
## c3
## 3.162278
# Closer to centroid 2.
clust_label[3] <-2
# For observation 4:
distance(obs_data[4,], centroid1)
## c3
## 2.462214
distance(obs_data[4,], centroid2)
## c3
## 2.828427
# Closer to centroid 1.
clust_label[4] <- 1
# For observation 5:
distance(obs_data[5,], centroid1)
## c3
## 3.25
distance(obs_data[5,], centroid2)
## c3
## 3.162278
# This observation is closer to centroid 2.
clust_label[5] <- 2
# For observation 6:
distance(obs_data[6,], centroid1)
## c3
## 2.358495
distance(obs_data[6,], centroid2)
## c3
## 3.162278
# Closer to centroid 1.
clust_label[6] <- 1
# Plot the updated assignment, then recompute both centroids from it.
plot(obs_data[clust_label == 1, 1], obs_data[clust_label == 1, 2], col = 4, pch = 20, cex = 3,
xlim = c(0, 6), ylim = c(0,6))
points(obs_data[clust_label == 2, 1], obs_data[clust_label == 2, 2],
col = 3, pch = 20, cex = 3)
centroid1 <- c(mean(obs_data[clust_label == 1, 1]), mean(obs_data[clust_label == 1, 2]))
centroid2 <- c(mean(obs_data[clust_label == 2, 1]), mean(obs_data[clust_label == 2, 2]))
points(centroid1[1], centroid1[2], col = 4, pch = 4)
points(centroid2[1], centroid2[2], col = 3, pch = 4)
# Iteration 2: repeat the assignment step with the updated centroids.
obs <- obs_data
# For observation 1:
distance(obs[1,], centroid1)
## c3
## 4.949747
distance(obs[1,], centroid2)
## c3
## 1.25
# Closer to centroid 2. (The original comment said centroid 1, which
# contradicts the distances above and the assignment made below.)
clust_label[1] <- 2
# For observation 2:
distance(obs[2,], centroid1)
## c3
## 4.301163
distance(obs[2,], centroid2)
## c3
## 1.030776
# Closer to centroid 2.
clust_label[2] <- 2
# For observation 3:
distance(obs[3,], centroid1)
## c3
## 5.700877
distance(obs[3,], centroid2)
## c3
## 2.136001
# Closer to centroid 2 (comment corrected to match the distances above).
clust_label[3] <- 2
# For observation 4:
distance(obs[4,], centroid1)
## c3
## 0.7071068
distance(obs[4,], centroid2)
## c3
## 3.75
# Closer to centroid 1 (comment corrected to match the distances above).
clust_label[4] <- 1
# For observation 5:
distance(obs[5,], centroid1)
## c3
## 2.12132
distance(obs[5,], centroid2)
## c3
## 4.190764
# Closer to centroid 1.
clust_label[5] <- 1
# For observation 6:
distance(obs[6,], centroid1)
## c3
## 0.7071068
distance(obs[6,], centroid2)
## c3
## 3.816084
# Closer to centroid 1.
clust_label[6] <- 1
# Plot the new assignment and recompute the centroids again.
plot(obs[clust_label == 1, 1], obs[clust_label == 1, 2], col = 4, pch = 20, cex = 3,
xlim = c(0, 6), ylim = c(0, 6))
points(obs[clust_label == 2, 1], obs[clust_label == 2, 2],
col = 3, pch = 20, cex = 3)
centroid1 <- c(mean(obs[clust_label == 1, 1]), mean(obs[clust_label == 1, 2]))
centroid2 <- c(mean(obs[clust_label == 2, 1]), mean(obs[clust_label == 2, 2]))
points(centroid1[1], centroid1[2], col = 4, pch = 4)
# NOTE(review): centroid2 is drawn with col = 5 (cyan) here although its
# cluster's points use col = 3 (green); earlier plots used col = 3 for both.
points(centroid2[1], centroid2[2], col = 5, pch = 4)
# Iteration 3: one more assignment pass with the centroids from iteration 2.
# For observation 1:
distance(obs[1,], centroid1)
## c3
## 5
distance(obs[1,], centroid2)
## c3
## 0.4714045
# Closer to centroid 2. (The original comment said centroid 1, which
# contradicts the distances above and the assignment made below.)
clust_label[1] <- 2
# For observation 2:
distance(obs[2,], centroid1)
## c3
## 4.472136
distance(obs[2,], centroid2)
## c3
## 0.745356
# Closer to centroid 2.
clust_label[2] <- 2
# For observation 3:
distance(obs[3,], centroid1)
## c3
## 5.830952
distance(obs[3,], centroid2)
## c3
## 0.745356
# Closer to centroid 2 (comment corrected to match the distances above).
clust_label[3] <- 2
# For observation 4:
distance(obs[4,], centroid1)
## c3
## 0
distance(obs[4,], centroid2)
## c3
## 5.088113
# Closer to centroid 1 (comment corrected; the distance to centroid 1 is 0).
clust_label[4] <- 1
# For observation 5:
distance(obs[5,], centroid1)
## c3
## 1.414214
distance(obs[5,], centroid2)
## c3
## 5.587685
# Closer to centroid 1.
clust_label[5] <- 1
# For observation 6:
distance(obs[6,], centroid1)
## c3
## 1.414214
distance(obs[6,], centroid2)
## c3
## 4.955356
# Closer to centroid 1.
clust_label[6] <- 1
# The labels (2,2,2,1,1,1) are unchanged from iteration 2, so the algorithm
# has converged. Plot the final assignment and centroids.
plot(obs[clust_label == 1, 1], obs[clust_label == 1, 2], col = 4, pch = 20, cex = 3,
xlim = c(0, 6), ylim = c(0, 6))
points(obs[clust_label == 2, 1], obs[clust_label == 2, 2],
col = 3, pch = 20, cex = 3)
centroid1 <- c(mean(obs[clust_label == 1, 1]), mean(obs[clust_label == 1, 2]))
centroid2 <- c(mean(obs[clust_label == 2, 1]), mean(obs[clust_label == 2, 2]))
points(centroid1[1], centroid1[2], col = 4, pch = 4)
# NOTE(review): col = 5 (cyan) for centroid2 is inconsistent with the
# col = 3 (green) used for its cluster's points, as in the previous chunk.
points(centroid2[1], centroid2[2], col = 5, pch = 4)
The cluster assignments stopped changing at this point, so the algorithm has converged.
# Final clustering after convergence, re-plotted with a tighter y-range.
plot(obs[clust_label == 1, 1], obs[clust_label == 1, 2], col = 4, pch = 20, cex = 3,
xlim = c(0, 6), ylim = c(0, 4))
points(obs[clust_label == 2, 1], obs[clust_label == 2, 2],
col = 3, pch = 20, cex = 3)
# Problem 3: simulate a 60 x 50 matrix with three class-specific mean shifts
# (mean = 20*sqrt(class)) and inspect the first two principal components.
set.seed(1234)
# NOTE(review): matrix(..., ncol = 50) refills the 3000 simulated values
# column-major, so the three classes' draws are interleaved across columns
# rather than stacked as three 20 x 50 blocks. The per-class mean patterns
# still differ (the PCA plot below separates the groups), but rbind()-ing
# three 20 x 50 blocks would express the intent more directly -- TODO confirm.
data_prob_3<-matrix(sapply(1:3,function(y){rnorm(20*50,mean = 20*sqrt(y))}),ncol=50)
# True class label for each of the 60 rows: 1..1, 2..2, 3..3 (20 each).
data_class<-unlist(lapply(1:3, function(y){rep(y,20)}))
pca_op_3<-prcomp(data_prob_3)
# Scores on the first two principal components, coloured by true class.
plot(pca_op_3$x[,c(1:2)],col=data_class,pch=19)
From the above plot, we can see that the three classes appear to be separated; hence, we can go ahead and use k-means to cluster the observations.
# K-means with K = 3 on the raw data, compared against the true classes.
actual_class<-data_class
set.seed(123)
kmeans_cluster<-kmeans(data_prob_3,3)
# Confusion table: rows = true class, columns = cluster label. Cluster
# labels are arbitrary, so a perfect result has one nonzero count per row.
table(actual_class,kmeans_cluster$cluster)
##
## actual_class 1 2 3
## 1 0 20 0
## 2 20 0 0
## 3 0 0 20
# Same fit with nstart = 10 random starts (kmeans keeps the best of the 10).
kmeans_cluster_sets<-kmeans(data_prob_3,3,nstart = 10)
table(actual_class,kmeans_cluster_sets$cluster)
##
## actual_class 1 2 3
## 1 20 0 0
## 2 0 20 0
## 3 0 0 20
# PC-score plot coloured by the recovered cluster labels.
plot(pca_op_3$x[,c(1:2)],col=kmeans_cluster_sets$cluster,pch=19)
From the above summaries we can see that both runs recover the true classes perfectly up to a relabeling of the clusters (each true class maps to exactly one cluster); with nstart = 10 the cluster labels happen to match the class order, and the plot confirms the perfect clustering.
# K-means with K = 2: fewer clusters than true classes.
set.seed(123)
#------k=2 without nstart
kmeans_cluster_k2<-kmeans(data_prob_3,2)
table(actual_class,kmeans_cluster_k2$cluster)
##
## actual_class 1 2
## 1 0 20
## 2 0 20
## 3 20 0
#-----------Clustering with nstart=10
kmeans_cluster_k2_sets<-kmeans(data_prob_3,2,nstart=10)
# Both tables show classes 1 and 2 merged into one cluster, class 3 alone.
table(actual_class,kmeans_cluster_k2_sets$cluster)
##
## actual_class 1 2
## 1 0 20
## 2 0 20
## 3 20 0
plot(pca_op_3$x[,c(1:2)],col=kmeans_cluster_k2_sets$cluster,pch=19)
From the above summaries and plot, we can see that with k = 2 the data points of true classes 1 and 2 have been absorbed into a single cluster, whereas class 3 keeps its own cluster as before.
# K-means with K = 4: more clusters than true classes, so one true class
# must be split across two clusters.
#------k=4 without nstart
kmeans_cluster_k4<-kmeans(data_prob_3,4)
# Here class 1 is split 10/10 across two clusters.
table(actual_class,kmeans_cluster_k4$cluster)
##
## actual_class 1 2 3 4
## 1 10 0 0 10
## 2 0 0 20 0
## 3 0 20 0 0
#-----------Clustering with nstart=10
kmeans_cluster_k4_sets<-kmeans(data_prob_3,4,nstart=10)
# With 10 starts, class 3 is the one split (16/4) instead.
table(actual_class,kmeans_cluster_k4_sets$cluster)
##
## actual_class 1 2 3 4
## 1 0 0 0 20
## 2 0 0 20 0
## 3 16 4 0 0
plot(pca_op_3$x[,c(1:2)],col=kmeans_cluster_k4_sets$cluster,pch=19)
From the above summaries and plot, we can see that one of the true classes is split across two clusters (class 1 without nstart, class 3 with nstart = 10), whereas the other two classes each keep a single cluster.
# K-means with K = 3 run on the scores of the first two principal
# components instead of the raw 50-dimensional data.
#------k=3 on first two principal components--------
kmeans_cluster_pca_1_2<-kmeans(pca_op_3$x[,c(1:2)],3)
table(actual_class,kmeans_cluster_pca_1_2$cluster)
##
## actual_class 1 2 3
## 1 0 0 20
## 2 20 0 0
## 3 0 20 0
#------k=3 with nstart=10 on first two principal components--------
kmeans_cluster_pca_1_2_sets<-kmeans(pca_op_3$x[,c(1:2)],3,nstart = 10)
# Both tables show a perfect recovery up to relabeling of the clusters.
table(actual_class,kmeans_cluster_pca_1_2_sets$cluster)
##
## actual_class 1 2 3
## 1 0 0 20
## 2 0 20 0
## 3 20 0 0
plot(pca_op_3$x[,c(1:2)],col=kmeans_cluster_pca_1_2_sets$cluster,pch=19)
From the above summaries and plot, we can say that the algorithm performs well on the first two principal components, as the observations are perfectly clustered.
# K-means with K = 3 after centering and scaling each column to unit
# variance (scale() standardizes columns, which changes inter-point
# distances relative to the raw data).
kmeans_cluster_scaled<-kmeans(scale(data_prob_3,center = T,scale = T),3,nstart=5)
# The table below shows every true class spread over multiple clusters.
table(actual_class,kmeans_cluster_scaled$cluster)
##
## actual_class 1 2 3
## 1 0 12 8
## 2 4 9 7
## 3 9 4 7
plot(pca_op_3$x[,c(1:2)],col=kmeans_cluster_scaled$cluster,pch=19)
From the above summary and plot, we see that there is some overlap among all the clusters; hence, the clustering algorithm performs poorly here because scaling changes the distances between the data points.